Survey data is notoriously difficult to munge. Even when the data is recorded cleanly the options for ‘write in questions’, ‘choose from multiple answers’, ‘pick all that are right’, and ‘multiple choice questions’ makes storing the data in a tidy format difficult.
In 2014, FiveThirtyEight surveyed over 1000 people to write the article titled, America’s Favorite ‘Star Wars’ Movies (And Least Favorite Characters). They have provided the data on GitHub.
For this project, the goal is to validate the data provided on GitHub lines up with the articel be recreating at least two of the visuals and predict if a person from the survey makes at least $50k.
import randomimport polars as plimport numpy as npimport xgboost as xgbfrom lets_plot import*from sklearn.model_selection import train_test_split, GridSearchCVfrom sklearn.preprocessing import OneHotEncoder, LabelEncoderfrom sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifierfrom sklearn.metrics import ( classification_report, accuracy_score, recall_score, precision_score, f1_score )# add the additional libraries you need to import for ML hereLetsPlot.setup_html(isolated_frame=True)
Show the code
# import your data here using pandas and the URLurl ="https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"df = pl.read_csv("StarWars.csv")df_clean = df.rename({ df.columns[1]: "seen", df.columns[2]: "fan",**{df.columns[i]: f"seen_epi_{['i', 'ii', 'iii', 'iv', 'v', 'vi'][i -3]}"for i inrange(3, 9)},**{df.columns[i]: f"rank_epi_{['i', 'ii', 'iii', 'iv', 'v', 'vi'][i -9]}"for i inrange(9, 15)},**{df.columns[i]: df[df.columns[i]][0] for i inrange(15, 29)}, df.columns[29]: "shot_first", df.columns[30]: "ex_uni", df.columns[31]: "fan_ex_uni", df.columns[32]: "fan_star_trek",**{df.columns[i]: df.columns[i].lower().replace(' ', '_') for i inrange(33, 37)}, df.columns[37]: "location"})df_clean = df_clean[1:]
Show the code
# Helper functions and shared listsseen_dict = {"seen_epi_i_yes": 1, "seen_epi_ii_yes": 2, "seen_epi_iii_yes": 3, "seen_epi_iv_yes": 4, "seen_epi_v_yes": 5, "seen_epi_vi_yes": 6}rank_dict = {"rank_epi_i": 1, "rank_epi_ii": 2, "rank_epi_iii": 3, "rank_epi_iv": 4, "rank_epi_v": 5, "rank_epi_vi": 6,}favor_cols = [df[df.columns[i]][0] for i inrange(15, 29)]shot_cols = ["shot_first_Greedo", "shot_first_Han", "shot_first_I don't understand this question"]movie_dict = {1: "The Phantom Menace", 2: "Attack of the Clones", 3: "Revenge of the Sith", 4: "A New Hope", 5: "The Empire Strikes Back", 6: "Return of the Jedi"}def GetSeen(data): data_seen = data.filter( pl.any_horizontal([pl.col(i) >0for i in seen_dict.keys()]) ).select(seen_dict.keys()) seen_percs = data_seen.unpivot(variable_name="movie", value_name="percentage").group_by("movie").agg(pl.col('percentage').sum() / data_seen.height) seen_percs = seen_percs.with_columns( ((pl.col("percentage") *100).round(0).cast(pl.Int64).cast(pl.String) +'%').alias('perc_label') )# Specifies the order seen_percs = seen_percs.with_columns( pl.col('movie').replace_strict(seen_dict, default=0).alias('movie_order') ).with_columns( pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie') ).sort('movie_order', descending=True)return seen_percsdef GetRanks(data): data_rank = data.filter( pl.all_horizontal([pl.col(i) >0for i in seen_dict.keys()]) ).select(rank_dict.keys())# Episode 3 has 1 missing rank (when compared to the other movies the rank option left is 6) data_rank = data_rank.with_columns(pl.col('rank_epi_iii').replace(0, 6)) rank_percs = data_rank.unpivot(variable_name="movie", value_name="percentage").group_by("movie").agg(pl.col('percentage').filter(pl.col('percentage') ==1).sum() / data_rank.height) rank_percs = rank_percs.with_columns( ((pl.col("percentage") *100).round(0).cast(pl.Int64).cast(pl.String) +'%').alias('perc_label') )# Specifies the order rank_percs = rank_percs.with_columns( pl.col('movie').replace_strict(rank_dict, default=0).alias('movie_order') ).with_columns( pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie') ).sort('movie_order', descending=True)return rank_percsdef GetRatings(data): data_ratings = data.filter( pl.all_horizontal([pl.col(i) >0for i in seen_dict.keys()]) ).select(rank_dict.keys())# data_ratings = data_ratings.with_columns(pl.col('rank_epi_iii').replace("0", "6")) data_ratings = data_ratings.with_columns(pl.col('rank_epi_iii').replace(0, 6)) thirds = data_ratings.with_columns( pl.col(i).replace_strict({1: 1,2: 1,3: 2,4: 2,5: 3,6: 3 }, default=0).alias(i) for i in data_ratings.columns ) third_counts = thirds.unpivot(variable_name="movie").group_by(['movie', 'value']).len().pivot('value', index='movie') third_percs = third_counts.with_columns( [(pl.col(third_counts.columns[i]) / thirds.height).alias(third_counts.columns[i]) for i inrange(1, 4)] ) third_percs = third_percs.with_columns(pl.col('movie').replace_strict(rank_dict, default=0).alias('movie_order')).with_columns(pl.col('movie_order').replace_strict(movie_dict, default="0").alias('movie')).sort('movie_order', descending=True).select(['movie', '1', '2', '3']) third_percs = third_percs.rename({"1": 'Top third', "2": 'Middle third', "3": 'Bottom third'}) third_long = third_percs.unpivot(index='movie', variable_name='rating', value_name='percentage') third_long = third_long.with_columns( ((pl.col("percentage") *100).round(0).cast(pl.Int64).cast(pl.String) +'%').alias('perc_label') )return third_longdef GetFavorability(data): data_favors = data.filter( pl.any_horizontal([pl.col(i) >0for i in seen_dict.keys()]) ).select(favor_cols) fourth = data_favors.with_columns( pl.col(i).replace_strict({0: 1,1: 1,2: 2,3: 2,4: 3,5: 4,6: 4 }, default=0).alias(i) for i in data_favors.columns ) fourth_counts = fourth.unpivot(variable_name="character").group_by(['character', 'value']).len().pivot('value', index='character').sort('4') fourth_percs = fourth_counts.with_columns( [(pl.col(fourth_counts.columns[i]) / fourth.height).alias(fourth_counts.columns[i]) for i inrange(1, 5)] ) fourth_percs = fourth_percs.rename({"1": 'Unfamiliar', "2": 'Unfavorable', "3": 'Neutral', "4": "Favorable"}) fourth_percs = fourth_percs.select(['character', 'Favorable', 'Neutral', 'Unfavorable', 'Unfamiliar']) fourth_long = fourth_percs.unpivot(index='character', variable_name='favor', value_name='percentage') fourth_long = fourth_long.with_columns( ((pl.col("percentage") *100).round(0).cast(pl.Int64).cast(pl.String) +'%').alias('perc_label') )return fourth_longdef GetShot(data): data_shot = data.filter( pl.any_horizontal([pl.col(i) >0for i in shot_cols]) ).select(shot_cols) shot_percs = data_shot.unpivot(variable_name="who", value_name="percentage").group_by("who").agg( pl.col('percentage').sum() / data_shot.height ).with_columns( pl.col("who").replace({"shot_first_Han": "Han","shot_first_Greedo": "Greedo","shot_first_I don't understand this question": "I don't understand\nthis question" }) ) shot_percs = shot_percs.with_columns( ((pl.col("percentage") *100).round(0).cast(pl.Int64).cast(pl.String) +'%').alias('perc_label') )# Designed to specify the order shot_percs = shot_percs.with_columns( pl.col('who').replace_strict({"Han": 2,"Greedo": 1 }, default=0).alias('who_order') ).sort('who_order')return shot_percsdef FindMdls(X_train, X_test, y_train, y_test): param_grid = {'n_estimators': [50, 100, 150, 200, 250],'learning_rate': [0.01, 0.05, 0.1, 0.2],'max_depth': [3, 4, 5, 6, 7],'subsample': [0.75, 0.8, 0.85, 0.9, 1.0] } gb = GradientBoostingClassifier(random_state=795) grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1) grid_search.fit(X_train, y_train)print(f"Best Parameters: ", grid_search.best_params_)print(f"Best Score: ", grid_search.best_score_)# grid_search does implement predict predictions = grid_search.predict(X_test)print(classification_report(y_test, predictions))def FindXGBMdls(X_features, y_targets): param_grid = {'n_estimators': [50, 100, 150, 200, 250],'learning_rate': [0.01, 0.05, 0.1, 0.2],'max_depth': [3, 4, 5, 6, 7],'subsample': [0.75, 0.8, 0.85, 0.9, 1.0] } rnd =795 gb = GradientBoostingClassifier( random_state=rnd, n_estimators=50 )# grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)# grid_search.fit(X_train, y_train)# print(f"Best Parameters: ", grid_search.best_params_)# print(f"Best Score: ", grid_search.best_score_)# predictions = grid_search.predict(X_test)# print(classification_report(y_test, predictions)) X_train, X_test, y_train, y_test = train_test_split(X_features, y_targets, random_state=rnd, test_size=0.2) gb.fit(X_train, y_train) preds = gb.predict(X_test) acc = accuracy_score(y_test, preds)# f1 = f1_score(y_test, preds)print(classification_report(y_test, preds)) best_mdl = gb best_rnd = rnd best_acc = acc# best_f1 = f1print(f"Current random number: {rnd} with accuracy of: {acc}") randos = np.random.randint(1, 1000, size=100)for i in randos:# rnd = random.randint(rand_min, rand_max) rnd_mdl = GradientBoostingClassifier( random_state=i, n_estimators=50 ) X_train, X_test, y_train, y_test = train_test_split(X_features, y_targets, random_state=i, test_size=0.2) rnd_mdl.fit(X_train, y_train) rnd_pred = rnd_mdl.predict(X_test) rnd_acc = accuracy_score(y_test, rnd_pred)# rnd_f1 = f1_score(y_test, rnd_pred)if (rnd_acc > best_acc): best_mdl = rnd_mdl best_acc = rnd_acc# best_f1 = rnd_f1 best_rnd = iprint(f"Current random number: {i} with accuracy of: {rnd_acc}")print(f"Best random number: {best_rnd} with accuracy of: {best_acc}")
Show the code
# Include and execute your code heredf_clean = df_clean.with_columns( pl.col('household_income').replace({"$150,000+": 2,"$100,000 - $149,999": 2,"$50,000 - $99,999": 2,"$25,000 - $49,999": 1,"$0 - $24,999": 1 }, default=0).alias("income"))df_clean = df_clean.drop(['household_income'])df_all = df_cleandf_clean = df_clean.with_columns( pl.col("age").replace_strict({"> 60": 4,"45-60": 3,"30-44": 2,"18-29": 1, }, default=0).alias("age"))df_clean = df_clean.with_columns( pl.col(df_clean.columns[i]).replace({"Very favorably": 6,"Somewhat favorably": 5,"Neither favorably nor unfavorably (neutral)": 4,"Somewhat unfavorably": 3,"Very unfavorably": 2,"Unfamiliar (N/A)": 1 }, default=0).alias(df_clean.columns[i]) for i inrange(15, 29))df_clean = df_clean.with_columns( pl.when((pl.col(df_clean.columns[i]).is_null())) .then(pl.lit("no")) .otherwise(pl.lit("yes")) .alias(df_clean.columns[i]) for i inrange(3, 9))df_clean = df_clean.with_columns( pl.col(i).fill_null(0) for i in rank_dict.keys())df_clean = df_clean.cast({'rank_epi_i': pl.Int64, 'rank_epi_ii': pl.Int64, 'rank_epi_iii': pl.Int64, 'rank_epi_iv': pl.Int64, 'rank_epi_v': pl.Int64, 'rank_epi_vi': pl.Int64})df_clean = df_clean.with_columns( pl.col('education').replace_strict({"Less than high school degree": 1.0,"High school degree": 1.5,"Some college or Associate degree": 2.0,"Bachelor degree": 4.0,"Graduate degree": 7.0 }, default=0.0).alias('education'))cat_cols = ['seen', 'fan', 'seen_epi_i', 'seen_epi_ii', 'seen_epi_iii', 'seen_epi_iv', 'seen_epi_v', 'seen_epi_vi', 'shot_first', 'ex_uni', 'fan_ex_uni', 'fan_star_trek', 'gender', 'location']all_code = df_all.columns[1:37]enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)enc_ar = enc.fit_transform(df_clean[cat_cols])enc_cols =list(enc.get_feature_names_out(cat_cols))enc_df = pl.DataFrame(enc_ar, schema=enc_cols)df_tot = pl.concat([df_clean.drop(cat_cols), enc_df], how='horizontal')enc_all = enc.fit_transform(df_all[all_code])enc_all_cols =list(enc.get_feature_names_out(all_code))enc_all_df = pl.DataFrame(enc_all, schema=enc_all_cols)X = df_tot.drop(['RespondentID', 'income'])y = df_tot.select('income')X_all = enc_all_dfy_all = df_all.select('income')print(f"Here is a data preview.")display(df_clean.head(7))
Here is a data preview.
shape: (7, 38)
RespondentID
seen
fan
seen_epi_i
seen_epi_ii
seen_epi_iii
seen_epi_iv
seen_epi_v
seen_epi_vi
rank_epi_i
rank_epi_ii
rank_epi_iii
rank_epi_iv
rank_epi_v
rank_epi_vi
Han Solo
Luke Skywalker
Princess Leia Organa
Anakin Skywalker
Obi Wan Kenobi
Emperor Palpatine
Darth Vader
Lando Calrissian
Boba Fett
C-3P0
R2 D2
Jar Jar Binks
Padme Amidala
Yoda
shot_first
ex_uni
fan_ex_uni
fan_star_trek
gender
age
education
location
income
i64
str
str
str
str
str
str
str
str
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
i64
str
str
str
str
str
i64
f64
str
i64
3292879998
"Yes"
"Yes"
"yes"
"yes"
"yes"
"yes"
"yes"
"yes"
3
2
1
4
5
6
6
6
6
6
6
6
6
1
1
6
6
6
6
6
"I don't understand this questi…
"Yes"
"No"
"No"
"Male"
1
1.5
"South Atlantic"
0
3292879538
"No"
null
"no"
"no"
"no"
"no"
"no"
"no"
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
null
null
null
"Yes"
"Male"
1
4.0
"West South Central"
1
3292765271
"Yes"
"No"
"yes"
"yes"
"yes"
"no"
"no"
"no"
1
2
3
4
5
6
5
5
5
5
5
1
1
1
1
1
1
1
1
1
"I don't understand this questi…
"No"
null
"No"
"Male"
1
1.5
"West North Central"
1
3292763116
"Yes"
"Yes"
"yes"
"yes"
"yes"
"yes"
"yes"
"yes"
5
6
1
2
4
3
6
6
6
6
6
5
6
5
3
6
6
6
6
6
"I don't understand this questi…
"No"
null
"Yes"
"Male"
1
2.0
"West North Central"
2
3292731220
"Yes"
"Yes"
"yes"
"yes"
"yes"
"yes"
"yes"
"yes"
5
4
6
2
1
3
6
5
5
3
6
2
5
4
6
5
5
2
5
5
"Greedo"
"Yes"
"No"
"No"
"Male"
1
2.0
"West North Central"
2
3292719380
"Yes"
"Yes"
"yes"
"yes"
"yes"
"yes"
"yes"
"yes"
1
4
3
6
5
2
6
6
6
6
6
4
6
4
5
5
5
5
4
6
"Han"
"Yes"
"No"
"Yes"
"Male"
1
4.0
"Middle Atlantic"
1
3292684787
"Yes"
"Yes"
"yes"
"yes"
"yes"
"yes"
"yes"
"yes"
6
5
4
3
1
2
6
6
5
5
6
6
6
6
6
5
6
3
5
6
"Han"
"Yes"
"No"
"No"
"Male"
1
1.5
"East North Central"
0
Question 1
Build a machine learning model that predicts whether a person makes at least $50k with accuracy of at least 65%. Describe your model and report the accuracy.
The gradient boosting classifier model was used on the data. Limiting the n_estimators to 50 gave better results than increasing it, the accuracy of the model acheived 69%. Which isn’t all that good, but the data has a lot of missing values and the dataset of non missing values isn’t all that large.
Show the code
# Include and execute your code here# 693 sucks# 798, 343, 293, 841, 795rnd =795X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, random_state=rnd, test_size=0.2)model = GradientBoostingClassifier(random_state=rnd, n_estimators=50)model.fit(X_trn, y_trn)model = model.predict(X_tst)print(classification_report(y_tst, model))
Validate that the data provided on GitHub lines up with the article by recreating a 3rd visual from the article.
Using the data provided on GitHub, all 5 graphs from the article can be recreated. However, there are a few points of rounding error from some of the graphs but they ultimately refelct the results from the article.
Show the code
# Five charts:# Which 'Star Wars' Movies Have You Seen?df_seen = GetSeen(df_tot)movie_graph = ( ggplot(data=df_seen)+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")+ labs( title="Which 'Star Wars' Movie's Have You Seen?", subtitle="Of 835 respondents who have seen any film", x='', y='' )+ scale_x_continuous(limits=[0,1])+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')+ theme( panel_background=element_rect(fill='gray', linetype=0), plot_background=element_rect(fill='gray'),# panel_grid_major=element_rect(fill='gray'), panel_grid=element_blank(), legend_background=element_rect(fill='gray'), axis_text=element_text(color='black', size=18),# axis_title=element_text(color='white'), plot_title=element_text(color='black', face="bold", hjust=0, size=25), plot_subtitle=element_text(color='black', hjust=0, size=20), legend_text=element_text(color='white'), legend_title=element_text(color='white'), label_text=element_text(color='white'), axis_line_x=element_blank(), axis_ticks_x=element_blank(), axis_text_x=element_blank(), plot_title_position='plot' )+ ggsize(1600, 900))display(movie_graph)# What's the Best 'Star Wars' Movie?df_ranks = GetRanks(df_tot)rank_graph = ( ggplot(data=df_ranks)+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")+ labs( title="What's the Best 'Star Wars' Movie?", subtitle="Of 471 respondents who have seen all 6 films", x='', y='' )+ scale_x_continuous(limits=[0,0.4])+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.035, size=12, color='black')+ theme( panel_background=element_rect(fill='gray', linetype=0), plot_background=element_rect(fill='gray'), panel_grid=element_blank(), legend_background=element_rect(fill='gray'), axis_text=element_text(color='black', size=18), plot_title=element_text(color='black', face="bold", hjust=0, size=25), plot_subtitle=element_text(color='black', hjust=0, size=20), legend_text=element_text(color='white'), legend_title=element_text(color='white'), label_text=element_text(color='white'), axis_line_x=element_blank(), axis_ticks_x=element_blank(), axis_text_x=element_blank(), plot_title_position='plot' )+ ggsize(1600, 900))display(rank_graph)# How People Rate the 'Star Wars' Moviesdf_ratings = GetRatings(df_tot)rate_graph = ( ggplot(data=df_ratings)+ geom_bar(mapping=aes(x='percentage', y='movie', color='rating', fill='rating'), stat='identity', orientation='y', position='dodgev')+ facet_wrap(facets='rating', ncol=3, order=-1)+ guides(color='none', fill='none')+ labs( title="How People Rate the 'Star Wars' Movies", subtitle="How often each film was rated in the top, middle, and bottom third\n(by 471 respondents who have seen all six films)", x='', y='' )+ scale_x_continuous(limits=[0,1])+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.25, size=12, color='black')+ theme( panel_background=element_rect(fill='gray', linetype=0), plot_background=element_rect(fill='gray'), panel_grid=element_blank(), legend_background=element_rect(fill='gray'), axis_text=element_text(color='black', size=18), plot_title=element_text(color='black', face="bold", hjust=0, size=25), plot_subtitle=element_text(color='black', hjust=0, size=20), legend_text=element_text(color='white'), legend_title=element_text(color='white'), label_text=element_text(color='white'), axis_line_x=element_blank(), axis_ticks_x=element_blank(), axis_text_x=element_blank(), plot_title_position='plot', text=element_text(color='black'), strip_text=element_text(face='bold', size=20) )+ ggsize(1800, 900))display(rate_graph)# 'Star Wars' Character Favorability Ratingsdf_favors = GetFavorability(df_tot)favor_graph = ( ggplot(data=df_favors)+ geom_bar(mapping=aes(x='percentage', y='character', color='favor', fill='favor'), stat='identity', orientation='y', position='dodgev')+ facet_wrap(facets='favor', ncol=4, order=0)+ guides(color='none', fill='none')+ labs( title="'Star Wars' Character Favorability Ratings", subtitle="By 835 respondants", x='', y='' )+ scale_x_continuous(limits=[0,1.3])+ geom_text(aes(x='percentage', y='character', label='perc_label'), nudge_x=0.25, size=8, color='black')+ theme( panel_background=element_rect(fill='gray', linetype=0), plot_background=element_rect(fill='gray'), panel_grid=element_blank(), legend_background=element_rect(fill='gray'), axis_text=element_text(color='black', size=18), plot_title=element_text(color='black', face="bold", hjust=0, size=25), plot_subtitle=element_text(color='black', hjust=0, size=20), legend_text=element_text(color='white'), legend_title=element_text(color='white'), label_text=element_text(color='white'), axis_line_x=element_blank(), axis_ticks_x=element_blank(), axis_text_x=element_blank(), plot_title_position='plot', text=element_text(color='black'), strip_text=element_text(face='bold', size=20) )+ ggsize(1800, 1500))display(favor_graph)# Who Shot First?df_shot = GetShot(df_tot)shot_graph = ( ggplot(data=df_shot)+ geom_bar(mapping=aes(x='percentage', y='who'), stat='identity', orientation='y', color="lightblue", fill="lightblue")+ labs( title="Who Shot First?", subtitle="According to 828 respondents", x='', y='' )+ scale_x_continuous(limits=[0,0.45])+ geom_text(aes(x='percentage', y='who', label='perc_label'), nudge_x=0.05, size=12, color='black')+ theme( panel_background=element_rect(fill='gray', linetype=0), plot_background=element_rect(fill='gray'),# panel_grid_major=element_rect(fill='gray'), panel_grid=element_blank(), legend_background=element_rect(fill='gray'), axis_text=element_text(color='black', size=20),# axis_title=element_text(color='white'), plot_title=element_text(color='black', face="bold", hjust=0, size=25), plot_subtitle=element_text(color='black', hjust=0, size=20), legend_text=element_text(color='white'), legend_title=element_text(color='white'), label_text=element_text(color='white'), axis_line_x=element_blank(), axis_ticks_x=element_blank(), axis_text_x=element_blank(), plot_title_position='plot' )+ ggsize(1600, 900))display(shot_graph)